The dataset was retrieved from Kaggle (https://www.kaggle.com/adhok93/presidentialaddress/home) and contains all 58 presidential inauguration speeches, from George Washington's first address to the present day.

Donald Trump (above) proving his eyes were stronger than the sun.

Import Data

## Read the Kaggle inauguration-speech CSV into a data.frame.
## NOTE(review): hard-coded absolute Windows path — this only runs on the
## original author's machine; consider a relative path or a config variable.
inaug_speeches <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/inaug_speeches.csv", encoding = "ASCII")




Install Necessary Packages

##install needed packages
#install.packages(c("psych", "data.table", "stringr", "lubridate","tidyr", "dplyr", "tidytext", 
    ##"ggplot2", "RWeka", "quanteda", "tm","wordcloud", "lsa", "ggcorrplot", "plotly", "rJava", "NLP", 
    ##"openNLP","openNLPmodels.en", "reshape", "tibble","googleVis", "magrittr", "RColorBrewer"))

1. Exploratory Data Analysis (EDA)

## Initial look at the raw data: structure, column names, cardinality.
str(inaug_speeches)
## 'data.frame': 58 obs. of 6 variables — X (row id), Name,
## Inaugural.Address, Date, text, text2 (all factors except X).
inaug_speeches_dt <- as.data.table(inaug_speeches)
names(inaug_speeches)

## Unique values per column (58 speeches, 39 presidents, 5 address numbers,
## text2 is an overflow column used by a single speech).
vapply(inaug_speeches, function(col) length(unique(col)), integer(1))

## Distinct speeches delivered by each president.
speech_counts <- inaug_speeches_dt[, .(number_of_distinct_records = uniqueN(text)), by = Name]

## Render as a styled HTML table.
kable(speech_counts, escape = F, "html", table.attr = "class='dtable'") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, position = "center")
Name number_of_distinct_records
George Washington 2
John Adams 1
Thomas Jefferson 2
James Madison 2
James Monroe 2
John Quincy Adams 1
Andrew Jackson 2
Martin Van Buren 1
William Henry Harrison 1
James Knox Polk 1
Zachary Taylor 1
Franklin Pierce 1
James Buchanan 1
Abraham Lincoln 2
Ulysses S. Grant 2
Rutherford B. Hayes 1
James A. Garfield 1
Grover Cleveland 2
Benjamin Harrison 1
William McKinley 2
Theodore Roosevelt 1
William Howard Taft 1
Woodrow Wilson 2
Warren G. Harding 1
Calvin Coolidge 1
Herbert Hoover 1
Franklin D. Roosevelt 4
Harry S. Truman 1
Dwight D. Eisenhower 2
John F. Kennedy 1
Lyndon Baines Johnson 1
Richard Milhous Nixon 2
Jimmy Carter 1
Ronald Reagan 2
George Bush 1
Bill Clinton 2
George W. Bush 2
Barack Obama 2
Donald J. Trump 1

2. Text Cleaning/Manipulation

https://stackoverflow.com/questions/39993715/how-to-remove-unicode-u00a6-from-string
## Cast the two factor text columns to character so string ops work on them.
inaug_speeches_dt <- inaug_speeches_dt[, text:=as.character(text)]
inaug_speeches_dt <- inaug_speeches_dt[, text2:=as.character(text2)]
## Re-join the overflow column text2 (the one speech that could not fit in a
## single Excel cell) back onto text.
## REFACTOR: apply(..., 1, paste) coerced the data.table to a matrix;
## paste0() on the two columns is vectorized and identical.
inaug_speeches_dt$text <- paste0(inaug_speeches_dt$text, inaug_speeches_dt$text2)

## Working copy of the text for cleaning; trim leading/trailing whitespace.
inaug_speeches_dt$text_final <- trimws(inaug_speeches_dt$text)

## Strip encoding artifacts such as "<U+A097>" left by the export
## (see https://stackoverflow.com/questions/39993715/how-to-remove-unicode-u00a6-from-string).
## FIX: the original ran two gsub() calls that differed only in the case of
## "u"; a single ignore.case pass covers both.
inaug_speeches_dt$text_final <- gsub("\\s*<U\\+\\w+>\\s*", " ",
                                     inaug_speeches_dt$text_final,
                                     ignore.case = TRUE)
## Replace every whitespace character (tab, newline, etc.) with a plain space.
inaug_speeches_dt$text_final <- gsub("\\s", " ", inaug_speeches_dt$text_final)
## Remove anything that is not a letter, digit, space, or punctuation.
## BUG FIX: the original class "[^[A-Za-z0-9 ][:punct:]]" was malformed —
## the "]" after the space closed the class early, so punctuation was never
## part of the keep-set and the pattern rarely matched anything.
inaug_speeches_dt$text_final <- gsub("[^A-Za-z0-9 [:punct:]]", "",
                                     inaug_speeches_dt$text_final)




3. Date Formatting

convert dates using lubridate: https://www.rstudio.com/resources/cheatsheets/
date conventions: https://www.statmethods.net/input/dates.html
##load required packages for date handling
require(stringr)
require(lubridate)
require(tidyr)
require(dplyr)
## Inspect the raw date strings (exploratory, left disabled).
#unique(inaug_speeches_dt$Date)

## Keep the original date strings before reshaping them.
inaug_speeches_dt$DateOriginal <- inaug_speeches_dt$Date
## Normalize the one date in a different format (Clinton, 1997).
inaug_speeches_dt$Date <- gsub("20-Jan-97", "Monday, January 20, 1997", inaug_speeches_dt$Date)

## Split "Weekday, Month Day, Year" into three columns.
inaug_speeches_dt <- separate(inaug_speeches_dt, "Date", c('DayOfWeek', 'MonthDay', 'Year'), sep = ",") 

inaug_speeches_dt$MonthDay <- trimws(inaug_speeches_dt$MonthDay)
inaug_speeches_dt$Year <- trimws(inaug_speeches_dt$Year)

## (Optional) backfill MonthDay from DayOfWeek where the split left NAs.
#inaug_speeches_dt$MonthDay <- ifelse(is.na(inaug_speeches_dt$MonthDay), inaug_speeches_dt$DayOfWeek, inaug_speeches_dt$MonthDay)

## A bare "Inaugural Address" is a president's first one — relabel it.
inaug_speeches_dt$Inaugural.Address <- recode_factor(inaug_speeches_dt$Inaugural.Address, "Inaugural Address" = "First Inaugural Address")

## Replace anything that is not a weekday name with "Unknown".
## REFACTOR: the original seven-deep ifelse() ladder collapses to a single
## %in% test; NA inputs still propagate as NA, matching the ladder's result.
weekday_names <- c("Monday", "Tuesday", "Wednesday", "Thursday",
                   "Friday", "Saturday", "Sunday")
inaug_speeches_dt$DayOfWeek <-
  ifelse(is.na(inaug_speeches_dt$DayOfWeek), NA_character_,
         ifelse(inaug_speeches_dt$DayOfWeek %in% weekday_names,
                inaug_speeches_dt$DayOfWeek, "Unknown"))
## Count distinct speeches per weekday.
dt <- inaug_speeches_dt[, .(number_of_distinct = uniqueN(text_final)), by = DayOfWeek] 

kable(dt, escape = F, "html", table.attr = "class='dtable'")  %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
DayOfWeek number_of_distinct
Thursday 9
Monday 18
Saturday 9
Wednesday 8
Tuesday 7
Friday 7
## Re-assemble "MonthDay,Year" into a single Date string; rows whose split
## produced no Year contribute a trailing ",NA", stripped afterwards.
inaug_speeches_dt$Date <- paste(inaug_speeches_dt$MonthDay,
                                inaug_speeches_dt$Year, sep = ",")
inaug_speeches_dt$Date <- gsub(",NA", "", inaug_speeches_dt$Date)

## Inspect unique month/day values (exploratory, left disabled).
#unique(inaug_speeches_dt$MonthDay)

## multidate: try several strptime() formats in turn, keeping for each
## element the result of the LAST format that parses it.
##
## @param data    character vector of date strings; ordinal suffixes that
##                follow a digit ("2nd", "3rd", ...) are stripped first
## @param formats character vector of strptime() format strings, tried in order
## @return a Date vector the same length as `data`; elements no format can
##         parse remain NA
multidate <- function(data, formats){
  ## FIX: the original removed only the literal "nd" (anywhere in the
  ## string), so "1st"/"3rd"/"4th" ordinals never parsed and any other
  ## "nd" substring was mangled; anchor the strip to a preceding digit.
  data <- gsub("(\\d)(st|nd|rd|th)", "\\1", data, perl = TRUE)
  parsed <- as.Date(data, format = formats[1])
  for (fmt in formats[-1]) {
    attempt <- as.Date(data, format = fmt)
    ok <- !is.na(attempt)
    ## Later formats override earlier ones, as in the original.
    parsed[ok] <- attempt[ok]
  }
  parsed
}

### Convert the disparate date strings to Date objects.
### CLEANUP: the original format vector contained several exact duplicates
### (e.g. "%d %b %y", "%B %d, %Y", "%m%d%y", "%d/%m/%Y", "%d %B %Y").
### Because an identical format re-parses exactly the same rows to the same
### values, removing the duplicates is behavior-preserving.
inaug_speeches_dt$Date <- multidate(inaug_speeches_dt$Date, 
                                     c("%m/%d/%Y", "%d.%m.%Y", "%d %b %y", "%B %d, %Y",
                                       "%B %d %Y", "%d-%b-%y", "%m.%d.%Y", "%d %b, %Y",
                                       "%A, %m/%d/%y", "%A, %B %d, %Y", "%d of %B %Y",
                                       "%m%d%y", "%d%m%Y", "%d-%m-%Y", "%A %B %d %Y",
                                       "%d %B, %Y", "%d %B %Y", "%A %B %d, %Y",
                                       "%h %d %Y", "%m/%d%y", "%d/%m/%Y", "%B %d,%Y",
                                       "%b. %d,%Y", "%b-%d-%Y", "%m/%d.%Y"))

### Present all dates uniformly as m/d/Y strings.
inaug_speeches_dt$Date <- format(inaug_speeches_dt$Date, "%m/%d/%Y")
#unique(inaug_speeches_dt$Date)




4. Number of Characters and Number of Words

## Speech length in characters, computed on the cleaned text.
inaug_speeches_dt$nchar <- nchar(inaug_speeches_dt$text_final)

## Count words in each element of a character vector.
##
## @param string character vector of texts
## @param pseudo if TRUE, count runs of non-whitespace ("pseudo-words");
##               otherwise count runs of alphabetic characters
## @return integer vector of word counts, one per element of `string`
##
## FIX: the original read `nwords <- nwords <- function(...)` (accidental
## double assignment) and selected the pattern via ifelse() side-effect
## assignments; a plain if/else is the correct scalar choice.
nwords <- function(string, pseudo = FALSE){
  pattern <- if (pseudo) "\\S+" else "[[:alpha:]]+"
  str_count(string, pattern)
}

## Word count per speech via the nwords() helper defined above.
inaug_speeches_dt$num_words <- nwords(inaug_speeches_dt$text_final)

## Sequential speech number plus an "N Name" label used on plots.
inaug_speeches_dt$PresidentNumber <- seq.int(nrow(inaug_speeches_dt))
inaug_speeches_dt$Speech <- paste(inaug_speeches_dt$PresidentNumber,
                                  inaug_speeches_dt$Name)
## Visualizing speech length

require(tidytext)
require(dplyr)
require(ggplot2)
## Bar chart of words per speech.
## BUG FIX: the original mapped fill/label to inaug_speeches_dt$... vectors
## from OUTSIDE the plotted data; group_by() %>% summarise() sorts rows
## alphabetically by Speech, so those vectors no longer lined up with the
## bars. Carrying the columns through the grouping keeps them aligned.
inaug_speeches_dt %>%
  unnest_tokens(word, text_final) %>%
  group_by(Speech, Inaugural.Address, Year) %>%
  summarise(num_words = n()) %>%
  ungroup() %>%
  ggplot(aes(x = Speech, y = num_words)) +
  geom_bar(stat = "identity", width = 0.5, aes(fill = Inaugural.Address)) +
  scale_fill_manual(values = c("red", "goldenrod", "blue", "light blue", "black")) +
  theme(axis.text.x = element_text(vjust = 1, angle = 90)) +
  theme(legend.position = "bottom") +
  geom_text(aes(label = Year), vjust = 0, angle = 90, size = 2.5, hjust = 0) +
  ylim(c(0, 11500)) +
  labs(title="Speech Length",
       caption="United States: Inauguration Speeches")

## Average speech length by inaugural-address number.
## MODERNIZE: funs() is deprecated; compute the mean directly.
inaug_groups <- inaug_speeches_dt %>%
  group_by(Inaugural.Address) %>%
  summarise(num_words = mean(num_words, na.rm = TRUE)) %>%
  arrange(desc(num_words)) %>%
  mutate(num_words = round(num_words, 2))

inaug_groups <- as.data.table(inaug_groups)

## Render the per-address-number word-count means as a styled HTML table.
kable(inaug_groups, escape = F, "html", table.attr = "class='dtable'")  %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
Inaugural.Address num_words
First Inaugural Address 2637.18
Second Inaugural Address 1836.82
Third Inaugural Address 1346.00
Fourth Inaugural Address 559.00
## Keep only the analysis-ready columns for the downstream sections.
keep_cols <- c("PresidentNumber", "Name", "Inaugural.Address",
               "DayOfWeek", "Year", "Date", "DateOriginal",
               "text_final", "nchar", "num_words", "Speech")
inaug_speeches_clean <- subset(inaug_speeches_dt, select = keep_cols)

##################################### Export Cleaned Data ##########################################
#setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
#write.csv(inaug_speeches_clean, "inaug_speeches_clean.csv")


5. N-Grams Frequency

require(RWeka)
require(quanteda)
require(tm)
require(data.table)

Adjust the n-gram selection (one or more sizes) in this section

## Inspect the structure if needed (exploratory, left disabled).
#str(inaug_speeches_clean)

## Ensure we are working on a data.table copy.
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)

## Column holding the cleaned speech text.
text <- inaug_speeches_clean$text_final

## English stopwords plus any project-specific terms to drop.
custom_stopwords <- c(stopwords("english"), "otherwords")

################################# tokenize using quanteda ##########################################
## Tokenize: strip punctuation/symbols, lowercase, remove stopwords.
## (Stemming is available but left disabled.)
toks <- tokens(text, remove_punct = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(custom_stopwords)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))

## Bi- and tri-grams joined by "_" (sparsity/computing power matter here).
toksNgrams <- tokens_ngrams(toks, n = c(2, 3), concatenator = "_")


### Build the document-feature matrix from the n-gram tokens.
### FIX: the original first ran `dfm_toksNgrams <- as.matrix(toksNgrams)`
### and immediately overwrote the result with dfm(); the dead call is gone.
dfm_toksNgrams <- dfm(toksNgrams)
### NOTE(review): "high_school" looks like a leftover from another template
### (it cannot occur in inauguration speeches); kept for parity.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
### Drop n-grams that start or end with a stopword.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, 
                             pattern = c(paste0("^", stopwords("english"), "_"), 
                                         paste0("_", stopwords("english"), "$")), 
                             valuetype = "regex")

## Label rows with the "N Name" speech labels.
row.names(dfm_toksNgrams) <- inaug_speeches_clean$Speech

## Total frequency of each n-gram across all speeches.
term_frequency <- colSums(dfm_toksNgrams)

# Sort descending and convert to a table with the n-gram in column `rn`.
term_frequency <- sort(term_frequency, decreasing = TRUE)
term_frequency <- as.data.frame(term_frequency)
setDT(term_frequency, keep.rownames = TRUE)[]
##                            rn term_frequency
##      1:         united_states            158
##      2:                let_us            101
##      3:       american_people             41
##      4:    federal_government             34
##      5:             men_women             28
##     ---                                     
## 121852: dream_waiting_hopeful              1
## 121853: waiting_hopeful_world              1
## 121854:     hopeful_world_god              1
## 121855:       world_god_bless              1
## 121856:         bless_may_god              1
# alternative method of extract column names (words/ngrams) to new column in r
# term_frequency$names <- rownames(term_frequency)
## write term frequency into new file (optional)

#setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
#write.csv(term_frequency, "inaug_term_frequency.csv")

Frequency Visualizations

require(ggplot2)
library(wordcloud)
## Keep the 40 most frequent phrases.
term_frequency2 <- term_frequency %>% 
  arrange(desc(term_frequency)) %>%
  head(40)

## Bar chart of the most frequent words/phrases.
## FIX: fill now comes from the plotted data (aes(fill = term_frequency))
## instead of reaching into term_frequency2$... with `$`, which bypasses
## ggplot's data masking and breaks under reordering/faceting.
ggplot(term_frequency2, aes(x = reorder(rn, -term_frequency), y = term_frequency)) +
  geom_bar(stat = "identity", width = 0.5, aes(fill = term_frequency)) +
  theme(axis.text.x = element_text(vjust = 1, angle = 90)) +
  theme(legend.position = "none") +
  geom_text(aes(label = term_frequency), vjust = 0, angle = 90, size = 2.5, hjust = 0) +
  labs(title="Most Common Phrases", caption="United States: Inauguration Speeches")




6. Comparison Cloud


These comparison/commonality clouds compare the language used by the 57th (Obama II) and 58th
(Trump) Presidents. These visualizations can compare/contrast more than two texts, however.

Adjust the n-gram selection (one or more sizes) in this section

## Inspect the structure if needed (exploratory, left disabled).
#str(inaug_speeches_clean)

## Ensure we are working on a data.table copy.
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)

## Column holding the cleaned speech text.
text <- inaug_speeches_clean$text_final

## English stopwords plus any project-specific terms to drop.
custom_stopwords <- c(stopwords("english"), "otherwords")

################################# tokenize using quanteda ##########################################
## Tokenize: strip punctuation/symbols, lowercase, remove stopwords.
toks <- tokens(text, remove_punct = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(custom_stopwords)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))

## Unigrams (n = 1); the concatenator only matters for n > 1.
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")


### Build the unigram document-feature matrix.
### FIX: removed the dead `as.matrix(toksNgrams)` assignment that was
### immediately overwritten by dfm().
dfm_toksNgrams <- dfm(toksNgrams)
### "high_school" is a template leftover; kept for parity.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
### Drop tokens that start or end with a stopword (regex on the "_" joins).
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, 
                             pattern = c(paste0("^", stopwords("english"), "_"), 
                                         paste0("_", stopwords("english"), "$")), 
                             valuetype = "regex")

## Label rows with the "N Name" speech labels.
row.names(dfm_toksNgrams) <- inaug_speeches_clean$Speech

## Term frequencies, sorted descending, as a data.table with an `rn` column.
term_frequency <- colSums(dfm_toksNgrams)
term_frequency <- sort(term_frequency, decreasing = TRUE)
term_frequency <- as.data.frame(term_frequency)
setDT(term_frequency, keep.rownames = TRUE)[]
# alternative: term_frequency$names <- rownames(term_frequency)
Convert from quanteda to tm: https://rdrr.io/cran/quanteda/man/convert.html
## Compare two or more speeches; rows 57 and 58 are Obama II and Trump.
dat <- dfm_toksNgrams[c(57, 58), ]
## quanteda dfm -> tm DocumentTermMatrix -> TermDocumentMatrix -> base matrix,
## which is the input shape the wordcloud functions expect.
dat <- convert(dat, to = "tm")
dat <- as.TermDocumentMatrix(dat)
dat <- as.matrix(dat)

## Comparison cloud: terms over-represented in one speech vs the other.
comparison.cloud(dat, max.words = 80, random.order = FALSE,
                 colors = c("#1F497D", "#C0504D", "light blue"),
                 main="Differences Between Inauguration Speeches")

## Commonality cloud: terms shared across the selected speeches.
commonality.cloud(dat, random.order = FALSE, max.words = 50,
                  color="#1F497D", main="Commonalities in Inauguration Speeches")




7. Cosine Similarity


A measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. We can use this measure to gauge relative similarity of two texts.

Cosine similarity: https://www.youtube.com/watch?v=7cwBhWYHgsA
library(lsa)
require(ggplot2)
require(RWeka)
require(quanteda)
require(tm)
require(data.table)
#################################### Create tokens for cosine similarity ###########################

## Inspect the structure if needed (exploratory, left disabled).
#str(inaug_speeches_clean)

## Ensure we are working on a data.table copy.
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)

## Column holding the cleaned speech text.
text <- inaug_speeches_clean$text_final

## English stopwords plus any project-specific terms to drop.
custom_stopwords <- c(stopwords("english"), "otherwords")

################################# tokenize using quanteda ##########################################
## Tokenize: strip punctuation/symbols, lowercase, remove stopwords.
toks <- tokens(text, remove_punct = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(custom_stopwords)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))

## Unigrams (n = 1); the concatenator only matters for n > 1.
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")


### Build the unigram dfm used for cosine similarity.
### FIX: removed the dead `as.matrix(toksNgrams)` assignment that was
### immediately overwritten by dfm().
dfm_toksNgrams <- dfm(toksNgrams)
### "high_school" is a template leftover; kept for parity.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
### Drop tokens that start or end with a stopword.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, 
                             pattern = c(paste0("^", stopwords("english"), "_"), 
                                         paste0("_", stopwords("english"), "$")), 
                             valuetype = "regex")


#################################### Transform Matrix ##############################################


## (see https://www.youtube.com/watch?v=7cwBhWYHgsA)
## FIX: `dfm_toksNgrams[c(1:nrow(dfm_toksNgrams)), ]` selected every row —
## a no-op subset (and a 1:nrow() anti-pattern); convert the dfm directly.
CosineSim_dfm <- convert(dfm_toksNgrams, to = "tm")

## Dimensions of the document-term matrix (documents x terms).
dim(CosineSim_dfm)
## [1]   58 9195
## Cosine similarity between documents; lsa::cosine() works column-wise,
## so transpose to make each document a column.
CosineSim_Matrix <- cosine(t(as.matrix(CosineSim_dfm)))
## Should be square: one row/column per speech.
dim(CosineSim_Matrix)
## [1] 58 58
## As a data.frame for joining/plotting.
CosineSim_Matrix <- as.data.frame(CosineSim_Matrix)

###################### join to primary dataset (optional) #########################################
## Row-order ids allow joining similarities back onto the speeches.
CosineSim_Matrix$UniqueResp <- seq.int(nrow(CosineSim_Matrix))
inaug_speeches_clean$UniqueResp <- seq.int(nrow(inaug_speeches_clean))
##cartesian join (left disabled)
#inaug_speeches_wCosine <- merge(x = inaug_speeches_clean, y = CosineSim_Matrix, by = "UniqueResp", all.x = TRUE)

Cosine Similarity Visualization

require(ggcorrplot)
#################################### visualize cosine similarity matrix ############################
## Label rows/columns with the "N Name" speech labels so the heatmap axes
## are readable. (The as.data.frame() call is redundant — the matrix was
## already converted above — but is left untouched here.)

CosineSim_Matrix <- as.data.frame(CosineSim_Matrix)
row.names(CosineSim_Matrix) <- inaug_speeches_clean$Speech
colnames(CosineSim_Matrix) <- t(inaug_speeches_clean$Speech)
## Hand-picked row/column indices — presumably the first inaugural speeches
## since 1900 (per the caption below); TODO confirm against PresidentNumber.
CosineSim_Matrix_1900 <- CosineSim_Matrix[c(30:32, 34:37, 41:42, 44:46, 48:49, 51, 52, 54, 56, 58), 
                                            c(30:32, 34:37, 41:42, 44:46, 48:49, 51, 52, 54, 56, 58)]

## Lower-triangle heatmap of all 58 x 58 pairwise cosine similarities.
ggcorrplot(CosineSim_Matrix, type = "lower", lab = TRUE, legend.title = "Cos Sim", 
           title = "Cosine Similarity of Inauguration Speeches",
           show.diag = TRUE, outline.color = "black", lab_size = 2)

## Same plot restricted to the post-1900 first-inaugural subset.
ggcorrplot(CosineSim_Matrix_1900, type = "lower", lab = TRUE, legend.title = "Cos Sim", 
           title = "Cosine Similarity of Inauguration Speeches",
           show.diag = TRUE, outline.color = "black", lab_size = 2)

As the visualizations show, there seems to be similar language across the different speeches. Based on our phrase frequencies, we may start to form the idea that these speeches use similar language.




8. Lexical Diversity


Lexical Diversity refers to “the range of different words used in a text, with a greater range indicating a higher diversity” see https://rdrr.io/github/kbenoit/quanteda/man/textstat_lexdiv.html

require(RWeka)
require(quanteda)
require(tm)
## Inspect the structure if needed (exploratory, left disabled).
#str(inaug_speeches_clean)

## Ensure we are working on a data.table copy.
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)

## Column holding the cleaned speech text.
text <- inaug_speeches_clean$text_final

## English stopwords plus any project-specific terms to drop.
custom_stopwords <- c(stopwords("english"), "otherwords")
###tokenize using quanteda

## Tokenize: strip punctuation/symbols, lowercase, remove stopwords.
toks <- tokens(text, remove_punct = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(custom_stopwords)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))

## Unigrams for the lexical-diversity measures.
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")


### Build the unigram dfm for lexical diversity.
### FIX: removed the dead `as.matrix(toksNgrams)` assignment that was
### immediately overwritten by dfm().
dfm_toksNgrams <- dfm(toksNgrams)
### "high_school" is a template leftover; kept for parity.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
### Drop tokens that start or end with a stopword.
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, 
                             pattern = c(paste0("^", stopwords("english"), "_"), 
                                         paste0("_", stopwords("english"), "$")), 
                             valuetype = "regex")

## Lexical-diversity statistics per speech, rounded to 3 decimals.
## NOTE(review): "all" presumably already covers the named measures, making
## them redundant but harmless — confirm against textstat_lexdiv() docs.
dt <- round(textstat_lexdiv(dfm_toksNgrams, measure = c("all", "TTR", "C", "R", "CTTR", "U", "S",
                               "Maas"), log.base = 10), 3)

row.names(dt) <- inaug_speeches_clean$Speech


## Render the lexical-diversity table as styled HTML.
kable(dt, escape = F, "html", table.attr = "class='dtable'")  %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
TTR C R CTTR U S Maas lgV0 lgeV0
1 George Washington 0.771 0.960 19.813 14.010 70.461 0.961 0.119 9.665 22.254
2 George Washington 0.935 0.984 7.366 5.209 110.921 0.972 0.095 9.849 22.678
3 John Adams 0.654 0.939 21.410 15.139 49.774 0.943 0.142 8.282 19.071
4 Thomas Jefferson 0.722 0.951 20.674 14.618 60.006 0.953 0.129 9.006 20.737
5 Thomas Jefferson 0.670 0.942 21.365 15.107 52.038 0.946 0.139 8.457 19.473
6 James Madison 0.825 0.969 19.033 13.458 89.046 0.969 0.106 10.762 24.780
7 James Madison 0.805 0.966 18.864 13.339 79.716 0.965 0.112 10.178 23.436
8 James Monroe 0.561 0.921 22.290 15.761 40.726 0.930 0.157 7.587 17.470
9 James Monroe 0.541 0.919 24.286 17.173 40.893 0.930 0.156 7.713 17.761
10 John Quincy Adams 0.634 0.937 23.479 16.602 49.684 0.943 0.142 8.405 19.353
11 Andrew Jackson 0.797 0.964 18.582 13.139 76.076 0.964 0.115 9.922 22.847
12 Andrew Jackson 0.708 0.945 16.682 11.796 50.241 0.944 0.141 7.959 18.326
13 Martin Van Buren 0.619 0.937 27.471 19.425 52.113 0.945 0.139 8.820 20.308
14 William Henry Harrison 0.445 0.902 27.518 19.458 36.509 0.919 0.166 7.479 17.221
15 James Knox Polk 0.508 0.912 24.283 17.171 38.380 0.924 0.161 7.491 17.249
16 Zachary Taylor 0.783 0.961 18.000 12.728 69.674 0.960 0.120 9.453 21.765
17 Franklin Pierce 0.632 0.938 25.354 17.928 51.607 0.945 0.139 8.667 19.956
18 James Buchanan 0.583 0.925 21.617 15.286 42.066 0.932 0.154 7.662 17.642
19 Abraham Lincoln 0.540 0.917 22.061 15.600 38.817 0.926 0.161 7.407 17.055
20 Abraham Lincoln 0.776 0.956 14.284 10.100 58.070 0.952 0.131 8.288 19.085
21 Ulysses S. Grant 0.703 0.944 16.595 11.734 49.195 0.943 0.143 7.871 18.124
22 Ulysses S. Grant 0.705 0.946 17.588 12.437 51.368 0.946 0.140 8.122 18.702
23 Rutherford B. Hayes 0.595 0.927 20.623 14.583 42.047 0.932 0.154 7.598 17.495
24 James A. Garfield 0.602 0.930 23.023 16.280 45.488 0.937 0.148 8.035 18.501
25 Grover Cleveland 0.702 0.947 19.993 14.137 55.155 0.949 0.135 8.598 19.797
26 Benjamin Harrison 0.562 0.925 25.930 18.335 44.231 0.935 0.150 8.088 18.623
27 Grover Cleveland 0.716 0.952 22.662 16.025 62.123 0.955 0.127 9.301 21.416
28 William McKinley 0.555 0.922 24.628 17.415 42.480 0.932 0.153 7.870 18.120
29 William McKinley 0.659 0.940 21.758 15.385 50.986 0.945 0.140 8.401 19.344
30 Theodore Roosevelt 0.687 0.939 14.597 10.322 43.268 0.935 0.152 7.224 16.635
31 William Howard Taft 0.486 0.908 24.844 17.567 37.290 0.922 0.164 7.422 17.090
32 Woodrow Wilson 0.672 0.941 19.161 13.549 49.114 0.943 0.143 8.073 18.588
33 Woodrow Wilson 0.648 0.934 16.872 11.930 42.590 0.934 0.153 7.371 16.973
34 Warren G. Harding 0.591 0.930 24.605 17.398 45.951 0.938 0.148 8.163 18.796
35 Calvin Coolidge 0.558 0.923 24.318 17.195 42.416 0.932 0.154 7.847 18.069
36 Herbert Hoover 0.520 0.913 22.130 15.649 37.408 0.923 0.163 7.286 16.777
37 Franklin D. Roosevelt 0.668 0.941 20.429 14.445 50.335 0.944 0.141 8.260 19.019
38 Franklin D. Roosevelt 0.638 0.934 19.468 13.766 45.166 0.938 0.149 7.779 17.912
39 Franklin D. Roosevelt 0.678 0.940 16.894 11.946 46.214 0.939 0.147 7.665 17.649
40 Franklin D. Roosevelt 0.768 0.953 12.456 8.808 51.099 0.945 0.140 7.581 17.456
41 Harry S. Truman 0.563 0.919 19.258 13.617 37.714 0.924 0.163 7.135 16.428
42 Dwight D. Eisenhower 0.626 0.934 21.905 15.489 46.829 0.939 0.146 8.077 18.599
43 Dwight D. Eisenhower 0.623 0.930 17.882 12.645 41.404 0.932 0.155 7.352 16.930
44 John F. Kennedy 0.646 0.933 17.123 12.108 42.681 0.934 0.153 7.399 17.037
45 Lyndon Baines Johnson 0.624 0.928 16.586 11.728 39.610 0.929 0.159 7.101 16.350
46 Richard Milhous Nixon 0.595 0.925 19.241 13.606 40.453 0.930 0.157 7.370 16.970
47 Richard Milhous Nixon 0.485 0.893 14.307 10.117 27.500 0.895 0.191 5.836 13.438
48 Jimmy Carter 0.689 0.942 16.848 11.913 47.651 0.941 0.145 7.774 17.900
49 Ronald Reagan 0.627 0.934 21.553 15.240 46.553 0.939 0.147 8.032 18.495
50 Ronald Reagan 0.580 0.924 20.978 14.834 41.045 0.931 0.156 7.535 17.350
51 George Bush 0.577 0.922 19.256 13.616 38.843 0.927 0.160 7.233 16.654
52 Bill Clinton 0.643 0.934 18.160 12.841 43.889 0.936 0.151 7.579 17.451
53 Bill Clinton 0.538 0.912 18.418 13.024 35.016 0.918 0.169 6.838 15.746
54 George W. Bush 0.616 0.928 17.584 12.434 40.269 0.930 0.158 7.235 16.658
55 George W. Bush 0.597 0.926 19.569 13.837 41.050 0.931 0.156 7.443 17.139
56 Barack Obama 0.662 0.942 22.969 16.242 52.968 0.947 0.137 8.634 19.880
57 Barack Obama 0.639 0.936 20.858 14.749 47.106 0.940 0.146 8.032 18.494
58 Donald J. Trump 0.595 0.922 16.560 11.709 37.004 0.923 0.164 6.876 15.833
## Keep a single diversity measure (CTTR) as a column for plotting.
inaug_speeches_clean$LexDiv_r <- textstat_lexdiv(dfm_toksNgrams, measure = c("CTTR"), log.base = 10)

Visualize Lexical Diversity

require(ggplot2)
require(plotly)

Lexical Diversity over time

## Year must be numeric for a continuous time axis.
inaug_speeches_clean <- inaug_speeches_clean[, Year:=as.numeric(Year)]

## How many inaugurations of each address number (1st, 2nd, ...).
dt <- inaug_speeches_clean[, .(number_of_distinct = uniqueN(PresidentNumber)), by = Inaugural.Address]

kable(dt, escape = F, "html", table.attr = "class='dtable'") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
Inaugural.Address number_of_distinct
First Inaugural Address 39
Second Inaugural Address 17
Third Inaugural Address 1
Fourth Inaugural Address 1
### Time-series plot of lexical diversity for first inaugural addresses only.
first_inaug <- inaug_speeches_clean %>%
  filter(Inaugural.Address == "First Inaugural Address")

## Line + point plot over time, each point labelled with the president.
LexDiv_plot <- ggplot(data = first_inaug, aes(x = Year, y = LexDiv_r)) +
  geom_line(colour = "blue", linejoin = "mitre") +
  geom_point(colour = "Black") +
  theme(axis.text.x = element_text(vjust = 1, angle = 90)) +
  geom_text(aes(label = Name), vjust = 0, angle = 90, size = 2.5, hjust = 0) +
  scale_x_continuous(breaks = seq(1789, 2017, 4))
## Render the plot (also reused below for the plotly version).
LexDiv_plot

Lexical Diversity in plotly

## Interactive (plotly) version of the lexical-diversity time series.
LexDiv_plotly <- ggplotly(LexDiv_plot)
LexDiv_plotly

Scatter Plot of Lexical Diversity & Length of Speech

##################### scatter plot
## Words per speech vs lexical diversity (CTTR), coloured by year,
## rendered interactively via plotly.
scat <- ggplotly(
  ggplot(inaug_speeches_clean, aes(x = num_words, y = LexDiv_r)) +
    geom_point(aes(color = Year))
)
scat

We find that there have been 17 presidents who were elected for a second term. We also see that there does not appear to be a trend in the variety of language used over the years (based on the CTTR measure of Lexical Diversity). Upon further examination, we see that the Lexical Diversity measure we used may be biased towards longer speeches.




9. Named Entity Recognition (NER)

require(rJava)
require(NLP)
require(openNLP)
require(data.table)
require(dplyr)
##load in data
#inaug_speeches_clean <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/inaug_speeches_clean.csv")

###convert raw data to data.table 
data <- as.data.table(inaug_speeches_clean)

###convert column(s) with text to be anaalyzed to charachter
data <- data[, text_final:=as.character(text_final)]

###to subset data with less than certain number of charachters (optional)
#data$NumChar <- nchar(data$TextColumn)
#data <- data[NumChar > 3, ]

###check for unique responses to subset by (optional)
#unique(data$ColumnWithFactors)
#data_subset <- data[which(data$ColumnWithFactors == 'Particular Response'),]

##remove NAs
data <- data[!(is.na(data$text_final) | data$text_final==""), ]

##inspect available columns before subsetting
names(inaug_speeches_clean)
##  [1] "PresidentNumber"   "Name"              "Inaugural.Address"
##  [4] "DayOfWeek"         "Year"              "Date"             
##  [7] "DateOriginal"      "text_final"        "nchar"            
## [10] "num_words"         "Speech"            "UniqueResp"       
## [13] "LexDiv_r"
##keep only the first inaugural address of each president
##(adjust this filter to analyse a different subset)
is_first <- inaug_speeches_clean$Inaugural.Address == 'First Inaugural Address'
inaug_speeches_first <- inaug_speeches_clean[which(is_first), ]

##collapse the selected speech texts into a single NLP::String for tagging
text <- as.String(inaug_speeches_first$text_final)
require(magrittr)
require(openNLPmodels.en)
require(rJava)
require(NLP)
require(openNLP)
require(data.table)
library(dplyr)
##increase memory limit ~ 7.2gb
##NOTE(review): memory.limit() is Windows-only (and defunct in R >= 4.2) -- confirm target platform
memory.limit(size=56500)
##annotator that finds word token spans
word_ann <- Maxent_Word_Token_Annotator()
##annotator that finds sentence spans
sent_ann <- Maxent_Sent_Token_Annotator()
##annotator that assigns part-of-speech tags
pos_ann <- Maxent_POS_Tag_Annotator()

##list order matters: sentences must be annotated before words, words before POS
pos_annotations <- annotate(text, list(sent_ann, word_ann, pos_ann))
text_annotations <- annotate(text, list(sent_ann, word_ann))

#head(text_annotations)
##wrap the raw text plus its sentence/word annotations into one document object
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
#words(text_doc) %>% head(10)

##entity annotators: each one tags spans of the named kind
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")
date_ann <- Maxent_Entity_Annotator(kind = "date")

##full pipeline: sentence and word annotation first, then the entity annotators
pipeline <- list(sent_ann,
                 word_ann,
                 person_ann,
                 location_ann,
                 organization_ann,
                 date_ann)

##NOTE(review): NLP::annotate() documents its first argument as the text itself;
##passing the document object (text_doc) instead of `text` here looks unintended -- verify
text_annotations <- annotate(text_doc, pipeline)
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)


##Extract the text spans of annotated entities from a document.
##doc:  an AnnotatedPlainTextDocument produced above
##kind: optional entity kind ("person", "location", "organization", "date");
##      when omitted, the text of every "entity" annotation is returned
entities <- function(doc, kind){
  s <- doc$content
  a <- annotations(doc)[[1]]
  if(hasArg(kind)) {
    k <- sapply(a$features, '[[', "kind")
    s[a[k == kind]]
  } else {
    ##bug fix: original read `a[s[a$tpe == "entity"]]` -- the subscripts were
    ##inverted and the field name is `type`, not `tpe`, so this branch always failed
    s[a[a$type == "entity"]]
  }
}

##pull each entity kind out of the annotated document as a one-column
##data.table (column V1 holds the matched text spans)
person <- as.data.table(entities(text_doc, kind = "person"))
location <- as.data.table(entities(text_doc, kind = "location"))
organization <- as.data.table(entities(text_doc, kind = "organization"))
date <- as.data.table(entities(text_doc, kind = "date"))


##visualize with googleVis (optional)
#library(googleVis)
#dfl <- data.frame(table(entities(text_doc, kind = "location")))
#Barp <- gvisColumnChart(dfl)
#plot(Barp)



##Convert NER terms to per-kind frequency tables, then stack them.

##Build a frequency table for one entity kind: number the raw hits, tag them
##with their NER type, count occurrences of each distinct term, sort by
##descending count, and keep one row per term.
##hits: one-column data.table (V1 = matched text spans); type: label string.
##Returns columns V1, NER_type, Count -- identical to the former copy-pasted
##location/person/organization/date blocks this helper replaces.
count_ner_terms <- function(hits, type) {
  hits$X <- seq.int(nrow(hits))
  hits$NER_type <- type
  counted <- hits %>%
    group_by(V1) %>%
    mutate(Count=n_distinct(X)) %>%
    arrange(desc(Count))
  ##drop duplicate terms; keep term (1), type (3) and count (4) columns
  counted[!duplicated(counted$V1), c(1, 3:4)]
}

location_final <- count_ner_terms(location, "location")
person_final <- count_ner_terms(person, "person")
organization_final <- count_ner_terms(organization, "organization")
date_final <- count_ner_terms(date, "date")

##stack the per-kind frequency tables into one combined table
NER_combined <- rbind(organization_final, date_final, person_final, location_final)


NER: Frequency Visualizations

require(ggplot2)
require(wordcloud)
##subset top 40 most frequent terms using dplyr pipes
NER_combined2 <- NER_combined %>%
  arrange(desc(Count)) %>%
  head(40)

##bar chart of the 40 most frequent entities, coloured by entity type
##bug fix: fill was mapped as NER_combined2$NER_type -- referencing the data
##frame inside aes() bypasses ggplot2's data masking (fragile under reordering
##or facetting and mislabels the legend); map the bare column name instead
ggplot(NER_combined2, aes(x=reorder(V1, -Count), y=(Count))) +
  geom_bar(stat = "identity", width=0.5,
           aes(fill = NER_type)) +
  theme(axis.text.x = element_text(vjust=1,angle=90)) + theme(legend.position="bottom") +
  scale_fill_manual(values = c("red", "goldenrod", "blue", "light blue", "black")) +
  geom_text(aes(label=Count), vjust=0,angle=90,size=2.5,hjust=0)+
  labs(title="NER: Frequencies", caption="United States: Inauguration Speeches")

NER: Dictionary Creation

The creation of this dictionary will allow us to compare to individual responses, as the NER model takes in free text as one chunk to compare to its machine-trained model.

############################## NER: Dictionary Creation ############################################
###export NER terms for further analysis
setwd("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks")
write.csv(NER_combined, 'NER_combined.CSV')

###################################### NER Term List #############################################
##reload the exported NER terms to build a custom dictionary
NER_combined <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/NER_combined.CSV")
NER_combined <- as.data.table(NER_combined)

##convert needed text columns to characters
#str(NER_combined)
##rename column with words
colnames(NER_combined)[colnames(NER_combined) == 'V1'] <- 'NER_terms'

##convert NER terms from factors to characters
NER_combined <- NER_combined[, NER_terms:=as.character(NER_terms)]
##trim whitespace and convert to lowercase
NER_combined$NER_terms <- trimws(tolower(NER_combined$NER_terms))
##replace spaces with underscores so multi-word entities match n-gram tokens
NER_combined$NER_terms <- gsub(" ", "_", NER_combined$NER_terms)

### transform list of NER terms to custom dictionary format ###
##(the original assigned NER_terms twice with the identical call; once suffices)
NER_terms <- as.data.table(NER_combined$NER_terms)
NER_dictionary <- dictionary(as.list(NER_terms))
############ apply custom dictionary and output words occurring in new column ######################
## Tokenize, convert to DFM, select N-Grams, and clean

###tokenization packages
require(tibble)
require(quanteda)
require(tm)
require(stringr)
library(data.table)

##increase memory limit (~7.2GB)
##NOTE(review): memory.limit() is Windows-only (and defunct in R >= 4.2)
memory.limit(size=56500)

################################## Text Preprocessing Pipeline #####################################

###custom stopwords list: English defaults plus project-specific placeholder terms
custom_stopwords <- (c(stopwords("english"), "additional", "stopwords"))

##Tokenize a character vector and reduce it to a dfm of NER dictionary terms.
##Steps: strip punctuation/symbols, lowercase, build 1-3 grams joined with "_",
##keep only features present in NER_dictionary, drop n-grams starting or ending
##with an English stopword, and drop documents left with no features.
##x: character vector of texts. Returns a quanteda dfm.
tokenize_dfm <- function(x) {
  tokenize <- tokens(x, remove_punct = TRUE,
                     remove_symbols = TRUE)
  tok_lower <- tokens_tolower(tokenize)
  #tok_stem <- tokens_wordstem(tok_lower, language = quanteda_options("language_stemmer"))
  toksNgrams <- tokens_ngrams(tok_lower, n = c(1, 2, 3), concatenator = "_") ###specify n-grams
  dfm_toksNgrams <- dfm(toksNgrams)
  #dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, custom_stopwords)
  ## keep only words occurring in the NER dictionary
  dfm_toksNgrams <- dfm_select(dfm_toksNgrams, NER_dictionary, selection = "keep")
  dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
                               pattern = c(paste0("^", stopwords("english"), "_"),
                                           paste0("_", stopwords("english"), "$")),
                               valuetype = "regex")
  ##rowSums() keeps the dfm sparse, unlike apply(dfm, 1, sum) which densifies it
  rowTotals <- rowSums(dfm_toksNgrams)
  ##drop documents with no remaining features and return the dfm directly
  ##(the original ended with a dangling assignment as its return value)
  dfm_toksNgrams[rowTotals > 0, ]
}

###run select data through tokenization/cleaning function
FreeText_tokens_dfm <- tokenize_dfm(inaug_speeches_clean$text_final)

###returns matrix dimensions and sparsity
FreeText_tokens_dfm
###view top terms
head(featnames(FreeText_tokens_dfm))


##returns overall counts of word occurrences
textstat_frequency(FreeText_tokens_dfm)

####transform dfm to data frame (one row per document, one column per term)
FreeText_tokens_df <- convert(FreeText_tokens_dfm, "data.frame")

##recode term counts: zero -> FALSE, anything nonzero -> TRUE
##NOTE: statement order is load-bearing; the zero pass must run first
FreeText_tokens_df[FreeText_tokens_df == 0] <- FALSE
FreeText_tokens_df[FreeText_tokens_df != 0] <- TRUE

#wc <- droplevels(col(FreeText_tokens_df, as.factor=TRUE)[which(FreeText_tokens_df != NA)])
#FreeText_tokens_df[levels(wc)] <- Map(factor, FreeText_tokens_df[levels(wc)], labels = levels(wc))
#FreeText_tokens_df

##change True/False to column names (words/phrases): every TRUE cell is
##overwritten with the name of its own column
w <- which(FreeText_tokens_df==TRUE,arr.ind=TRUE)
FreeText_tokens_df[w] <- names(FreeText_tokens_df)[w[,"col"]]

##create new column that concatenates all columns (with occurrences of words) by individual response
FreeText_tokens_df$NER_terms <- apply(FreeText_tokens_df[, 1:ncol(FreeText_tokens_df)], 1, paste, collapse = ",")

##remove whitespace
#trimws(FreeText_tokens_df$NER_terms)

##Remove "0s" (that represent blanks) in concatenated column
##("0," first so interior blanks go, then any bare trailing "0")
FreeText_tokens_df$NER_terms <- gsub("0,", "", FreeText_tokens_df$NER_terms)
FreeText_tokens_df$NER_terms <- gsub("0", "", FreeText_tokens_df$NER_terms)

##Remove last character "," at end of string
FreeText_tokens_df$NER_terms <- substr(FreeText_tokens_df$NER_terms,1,nchar(FreeText_tokens_df$NER_terms)-1)
dim(FreeText_tokens_df)

##rearrange columns to put the comma-delimited term list first
FreeText_tokens_df <- FreeText_tokens_df[c(ncol(FreeText_tokens_df), 1:(ncol(FreeText_tokens_df)-1))]
## check that the necessary columns remain
names(FreeText_tokens_df)

## create new df holding only the comma-delimited list of NER terms
text_NERterms <- FreeText_tokens_df[c(1)]

### count NER terms per response: number of commas + 1
text_NERterms$count_NERterms <- str_count(text_NERterms$NER_terms, ",") + 1

##keep row names as a key column (rn) for the merge back onto the speeches
setDT(text_NERterms, keep.rownames = TRUE)[]
##strip letters from the rowname -- assumes each rowname embeds a numeric
##document id (e.g. "text12" -> "12"); TODO confirm against convert() output
text_NERterms$rn <- gsub("[A-Za-z]", "", text_NERterms$rn)

text_NERterms <- text_NERterms[, rn:=as.integer(rn)]

################################ merge extracted terms to complete set ############################
##left-join y onto x, matching x$PresidentNumber against y$rn;
##rows of x without a match keep NA in the joined columns
merge_doc <- function(x, y) {
  merge(
    x, y,
    by.x = "PresidentNumber",
    by.y = "rn",
    all.x = TRUE
  )
}

##left-join the NER term lists onto the full speech data by president number
feedbackCombined_NERterms <- merge_doc(inaug_speeches_clean, text_NERterms)
#str(feedbackCombined_NERterms)

##duplicate list to use for tooltip
feedbackCombined_NERterms$NER_terms_list <- feedbackCombined_NERterms$NER_terms

### count number of NER terms occurring in the merged column (commas + 1)
feedbackCombined_NERterms$count_NERterms <- str_count(feedbackCombined_NERterms$NER_terms, ",") + 1

##placeholder for responses without at least one NER term
##fix: `== c("")` wrapped the empty string in c() for no reason -- plain ""
##compares identically and does not suggest a multi-element comparison
feedbackCombined_NERterms$NER_terms[which(is.na(feedbackCombined_NERterms$NER_terms))] <- "none"
feedbackCombined_NERterms$NER_terms[which(feedbackCombined_NERterms$NER_terms == "")] <- "none"




10. Transformation of NER List for Tableau


This transformation will allow us to filter by individual words.

require(reshape)
require(tidyr)
##separate NER terms into up to 60 new columns
##NOTE(review): assumes no response has more than 60 terms -- terms beyond the
##60th would be dropped by separate(); verify against count_NERterms
feedbackCombined_NERterms <- separate(feedbackCombined_NERterms, "NER_terms", c(paste0("NER_terms",1:60)), 
                                      sep = ",") 

##convert new split columns to data.table
feedbackCombined_NERterms <- as.data.table(feedbackCombined_NERterms)
#names(feedbackCombined_NERterms)

##melt the split columns into long format: one row per (speech, NER term)
feedbackCombined_NERterms <- melt(feedbackCombined_NERterms, measure.vars = c(paste0("NER_terms",1:60)), 
                                  value.name = "NER_term")
##remove NAs
feedbackCombined_NERterms <- feedbackCombined_NERterms[!is.na(feedbackCombined_NERterms$NER_term)]
##check column names
names(feedbackCombined_NERterms)
##  [1] "PresidentNumber"   "Name"              "Inaugural.Address"
##  [4] "DayOfWeek"         "Year"              "Date"             
##  [7] "DateOriginal"      "text_final"        "nchar"            
## [10] "num_words"         "Speech"            "UniqueResp"       
## [13] "LexDiv_r"          "count_NERterms"    "NER_terms_list"   
## [16] "variable"          "NER_term"
##count how many distinct presidents used each NER term
feedbackCombined_NERterms[, .(number_of_distinct = uniqueN(PresidentNumber)), by = NER_term]
##subset columns
##subset columns




Final Data Export

########################### Final data export #####################################################
##NOTE(review): setwd() in a script is fragile -- consider passing a full path
##to write.csv() instead of changing the working directory
setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
write.csv(feedbackCombined_NERterms, "feedbackCombined_NERterms.csv")